# -*- coding: utf-8 -*-
#@Time : 2020/8/31 12:56
#@Author : 阳某
#@File : 动物科学学院.py
#@Software : PyCharm

import re
from urllib.parse import urljoin

import requests
from fake_useragent import UserAgent
from lxml import etree
from openpyxl import Workbook
wb = Workbook()
wa = wb.active
headers = {
    'user-agent': UserAgent().chrome,
}

# Index pages that list links to the individual staff pages.
urls = [
    'http://jgxy.bjut.edu.cn/szdw/qtjsml/201579/19707_1.html',
]

# E-mail address pattern; compiled once because it is applied to every page.
_EMAIL_RE = re.compile(
    r"[\w!#$%&'*+/=?^_`{|}~-]+(?:\.[\w!#$%&'*+/=?^_`{|}~-]+)*@(?:[\w](?:[\w-]*[\w])?\.)+[\w](?:[\w-]*[\w])?",
    re.S)


def _fetch(url, timeout):
    """Return the response for *url* with its encoding set, or None on failure.

    Certificate verification is disabled (``verify=False``), matching the
    script's existing behaviour. Request errors are reported on stdout
    instead of being raised so that one bad link does not abort the crawl.
    """
    try:
        resp = requests.get(url, headers=headers, verify=False, timeout=timeout)
    except requests.Timeout:
        print("超时了")
        return None
    except requests.RequestException as err:
        # Connection errors, invalid URLs, unsupported schemes, ...
        print("请求失败: %s" % err)
        return None
    print(resp.status_code)
    # Use the encoding detected from the body rather than the header value.
    resp.encoding = resp.apparent_encoding
    return resp


def _parse_html(text):
    """Return the parsed HTML tree for *text*, or None if it cannot be parsed."""
    try:
        return etree.HTML(text)
    except (etree.LxmlError, ValueError):
        return None


def _extract_name(tree):
    """Return the stripped name text from a staff page, or None if absent.

    The first <strong> text inside the "zhengwen" div is preferred; the text
    of the div's second direct <p> child is the fallback.
    """
    names = tree.xpath('//div[@class="zhengwen"]/p/strong/text()')
    if not names:
        names = tree.xpath('//div[@class="zhengwen"]/p[2]/text()')
    return names[0].strip() if names else None


# verify=False makes urllib3 emit a warning per request; silence it once.
requests.packages.urllib3.disable_warnings()

try:
    for url in urls:
        resp = _fetch(url, timeout=5)
        if resp is None:
            continue
        # print(resp.text)
        index_tree = _parse_html(resp.text)
        if index_tree is None:
            print("错了")
            continue
        hrefs = index_tree.xpath('//div[@class="zhengwen"]//a/@href')
        # Resolve relative links against the index page URL, then drop
        # duplicates while keeping the order in which links appear.
        links = dict.fromkeys(urljoin(url, href.strip()) for href in hrefs)
        # Only http(s) links can be fetched; skip mailto:, javascript:, etc.
        url_li = [link for link in links
                  if link.startswith(('http://', 'https://'))]
        print(url_li)
        for u, url1 in enumerate(url_li, start=1):
            print('正在爬第%s个链接' % u)
            print(url1)
            res = _fetch(url1, timeout=3)
            if res is None:
                continue
            emails = _EMAIL_RE.findall(res.text)
            print(emails)
            if not emails:
                print('没有邮箱')
                continue
            page_tree = _parse_html(res.text)
            name = _extract_name(page_tree) if page_tree is not None else None
            if name is None:
                print("错了")
                continue
            # One spreadsheet row per staff page: name, first e-mail found.
            line = [name, emails[0]]
            print(line)
            try:
                wa.append(line)
            except Exception:
                # Best effort per row: openpyxl may reject a value; skip the
                # row and keep crawling instead of aborting the whole run.
                print("错了")
finally:
    # Save whatever was collected, even if the crawl was interrupted.
    wb.save('./建筑工程学院.xlsx')